# Alexander F. Gazmararian
# afg2@princeton.edu
#January 9, 2024

#Purpose: This file computes statistics describing how the study site compares to coal country

#Load Packages
library(here)
library(tidyverse)
library(tidylog)
library(kableExtra)

#Function for latex tables
source(here("code", "fun", "fix_txt.R"))

# Load sample data
g <- readRDS(here("data", "output", "matched_df.rds"))
#Subset to treated counties
g <- subset(g, treat == 1)

#Load data on mines
prod <- readRDS(here("data", "inter", "coalmines.rds"))
prod <- subset(prod, select = c(fips, calendar_yr, annual_coal_prod))
#Aggregate
prod <- prod %>%
  rename(year = calendar_yr) %>%
  group_by(fips, year) %>%
  summarize(annual_coal_prod = sum(annual_coal_prod, na.rm = TRUE)) %>%
  mutate(fips = as.numeric(fips))
#Merge
g <- left_join(g, prod, by = c("fips", "year"))
# Create indicator for interview counties, and interview state
g <- g %>%
  mutate(
    interview_county = case_when(
      county == "greene" & state == "Pennsylvania" ~ 1,
      T ~ 0
    ),
    interview_state = as.integer(state == "Pennsylvania")
  )
# Aggregate to the relevant level
g.agg <- g %>%
  group_by(fips, post_shock, interview_county, interview_state) %>%
  summarise(
    across(
      c(coal, annual_coal_prod, oilgas, coal_prop, oilgas_prop, NewGasDistance, NewGas, RepVotesMajorPercent, pop, white, hispanic, foreign, college, income99pclog, poverty, 
        rural_nonfarm, poplog, under40, work_female),
      ~ mean(.x, na.rm = TRUE)
    )
  ) %>%
  rename(gop = RepVotesMajorPercent) %>%
  mutate(annual_coal_prod = log(annual_coal_prod+1),
        NewGasDistance = log(NewGasDistance))

# Pivot wider
g.agg <- g.agg %>%
  pivot_wider(names_from = post_shock, values_from = c(coal:gop))
  
g.agg <- g.agg %>%
  mutate(
    coal_chg = coal_1 - coal_0,
    prod_chg = annual_coal_prod_1 - annual_coal_prod_0,
    coalprop_chg = coal_prop_1 - coal_prop_0,
    oilgas_chg = oilgas_1 - oilgas_0,
    gop_chg = gop_1 - gop_0,
    NewGasDistance_chg = NewGasDistance_1 - NewGasDistance_0
  )

tab.theory <-
  g.agg %>%
  group_by(interview_county) %>%
  summarize(across(c(prod_chg, annual_coal_prod_0, annual_coal_prod_1, coal_chg, coal_prop_0, coal_prop_1,
                     oilgas_chg, oilgas_prop_0, oilgas_prop_1,
                     NewGasDistance_chg, NewGasDistance_0, NewGasDistance_1,
                     gop_chg, gop_0, gop_1,
                     white, hispanic, foreign, college, income99pclog, poverty,
                     rural_nonfarm, poplog, under40, work_female), ~ mean(.x, na.rm = TRUE))) %>%
  t() %>%
  data.frame()


tab.pa <-
  g.agg %>%
  filter(interview_state == 1) %>%
  ungroup() %>%
  summarize(across(c(prod_chg, annual_coal_prod_0, annual_coal_prod_1, coal_chg, coal_prop_0, coal_prop_1,
                     oilgas_chg, oilgas_prop_0, oilgas_prop_1,
                     NewGasDistance_chg, NewGasDistance_0, NewGasDistance_1,
                     gop_chg, gop_0, gop_1,
                     white, hispanic, foreign, college, income99pclog, poverty,
                     rural_nonfarm, poplog, under40, work_female), ~ mean(.x, na.rm = TRUE))) %>%
  t() %>%
  data.frame()
tab.pa

tab.theory <- tab.theory[-1,]
tab.theory <- cbind(tab.theory, tab.pa)

names(tab.theory) <- c("All", "Field", "State")
tab.theory <- tab.theory[, c(2,3,1)]

rownames(tab.theory) <- c(
  "Coal Production (log) $\\Delta_{Post-Pre}$",
  "Pre-Shock Coal Production (log)",
  "Post-Shock Coal Production (log)",
  "Coal Employment $\\Delta_{Post-Pre}$",
  "Pre-Shock Coal Employment (\\%)",
  "Post-Shock Coal Employment (\\%)",
  "Hydraulic Fracturing Employment $\\Delta_{Post-Pre}$",
  "Pre-Shock Hydraulic Fracturing Employment (\\%)",
  "Post-Shock Hydraulic Fracturing Employment (\\%)",
  "Distance to Gas Power Plant (log) $\\Delta_{Post-Pre}$",
  "Pre-Shock Distance to Gas Power Plant (log)",
  "Post-Shock Distance to Gas Power Plant (log)",
  "GOP Vote Share $\\Delta_{Post-Pre}$",
  "Pre-Shock GOP Vote Share (\\%)",
  "Post-Shock GOP Vote Share (\\%)",
  "White",
  "Hispanic",
  "Foreign-born",
  "College",
  "Income per capita (log)",
  "Poverty",
  "Rural",
  "Population (log)",
  "Under 40 years",
  "Female workforce"
  )

tab_sum <- here("output", "tables", "si_tab_interview_summarystats.txt")
kbl(
  tab.theory,
  booktabs = TRUE,
  escape = FALSE,
  digits = 2,
  format = "latex"
) %>%
  group_rows(group_label = "Economic Effects of Shale Shock", start_row = 1, end_row = 9) %>%
  group_rows(group_label = "Visibility", start_row = 10, end_row = 12) %>%
  group_rows(group_label = "Outcome", start_row = 13, end_row = 15) %>%
  group_rows(group_label = "Socio-Demographics", start_row = 16, end_row = 25) %>%
  cat(., file = tab_sum)
